Install and load necessary packages

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(magrittr)
library(readr)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract()   masks magrittr::extract()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ car::recode()      masks dplyr::recode()
## ✖ MASS::select()     masks dplyr::select()
## ✖ purrr::set_names() masks magrittr::set_names()
## ✖ purrr::some()      masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Load the data

data <- read_csv("house-prices-advanced-regression-techniques/train.csv")
## Rows: 1460 Columns: 81
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (38): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Filter the data for specific neighborhoods & omit NA

filtered_data <- data %>%
  filter(Neighborhood %in% c("NAmes", "Edwards", "BrkSide")) %>%
  drop_na(SalePrice, GrLivArea, Neighborhood)

Summary statistics

summary_stats <- summary(filtered_data[c("SalePrice", "GrLivArea")])
summary_stats
##    SalePrice        GrLivArea   
##  Min.   : 39300   Min.   : 334  
##  1st Qu.:116000   1st Qu.:1003  
##  Median :135500   Median :1200  
##  Mean   :138062   Mean   :1302  
##  3rd Qu.:155000   3rd Qu.:1496  
##  Max.   :345000   Max.   :5642

Plotting distributions

# Histogram of Sale Prices
ggplot(filtered_data, aes(x = SalePrice)) + geom_histogram(bins = 30,fill = "blue",alpha = 0.7) +
  ggtitle("Distribution of Sale Prices")

# Histogram of Living Area
ggplot(filtered_data, aes(x = GrLivArea)) + geom_histogram(bins = 30, fill = "red",alpha = 0.7) +
  ggtitle("Distribution of Living Area")

# SalePrice vs GrLivArea scatterplot by Neighborhood
ggplot(filtered_data, aes(x = GrLivArea, y = SalePrice)) +
geom_point() +
facet_wrap(~ Neighborhood) +
labs(title = "SalePrice vs GrLivArea in Selected Neighborhoods",
x = "Living Area (GrLivArea)", y = "Sale Price")

# SalePrice vs GrLivArea Scatterplot
ggplot(filtered_data, aes(x = GrLivArea, y = SalePrice, color = Neighborhood)) +
geom_point() +
labs(title = "SalePrice vs GrLivArea in Selected Neighborhoods",
x = "Living Area", y = "Sale Price") +
theme_minimal()

Fit the SLR model

model <-
  lm(SalePrice ~ GrLivArea * Neighborhood, data = filtered_data)

# Summary of the model
modelsum <- summary(model)
modelsum
## 
## Call:
## lm(formula = SalePrice ~ GrLivArea * Neighborhood, data = filtered_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -96204 -14568   -310  12601 181131 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   19971.514  12351.125   1.617  0.10672    
## GrLivArea                        87.163      9.782   8.911  < 2e-16 ***
## NeighborhoodEdwards           68381.591  13969.511   4.895 1.46e-06 ***
## NeighborhoodNAmes             54704.888  13882.334   3.941 9.69e-05 ***
## GrLivArea:NeighborhoodEdwards   -57.412     10.718  -5.357 1.48e-07 ***
## GrLivArea:NeighborhoodNAmes     -32.847     10.815  -3.037  0.00256 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28550 on 377 degrees of freedom
## Multiple R-squared:  0.4474, Adjusted R-squared:   0.44 
## F-statistic: 61.04 on 5 and 377 DF,  p-value: < 2.2e-16
# Diagnostic plots
par(mfrow = c(1, 1))
plot(model)

Influence measures

influence_measures <- cooks.distance(model)

# Plot Cook's distance
plot(influence_measures,
     type = "h",
     main = "Cook's Distance",
     ylab = "Cook's Distance")

Identify high leverage points

influential_points <-
  which(influence_measures > (4/383))
influential_points
##  19  48  64  70  90 131 140 157 164 169 180 190 205 227 234 240 250 302 314 322 
##  19  48  64  70  90 131 140 157 164 169 180 190 205 227 234 240 250 302 314 322 
## 339 360 372 
## 339 360 372

SLR plot with influential points removed

# Remove influential points 
refined_data <- filtered_data[-influential_points, ]

# New model with high leverage points removed
model2 <-
  lm(SalePrice ~ GrLivArea * Neighborhood, data = refined_data)
summodel2 <- summary(model2)

# Summary of model
summodel2
## 
## Call:
## lm(formula = SalePrice ~ GrLivArea * Neighborhood, data = refined_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -62030 -13040    981  13115  66684 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   24290.931   9651.230   2.517 0.012281 *  
## GrLivArea                        82.187      7.898  10.405  < 2e-16 ***
## NeighborhoodEdwards           43625.330  13320.955   3.275 0.001161 ** 
## NeighborhoodNAmes             58599.377  10976.283   5.339 1.68e-07 ***
## GrLivArea:NeighborhoodEdwards   -39.778     10.785  -3.688 0.000261 ***
## GrLivArea:NeighborhoodNAmes     -35.006      8.836  -3.962 9.00e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21370 on 354 degrees of freedom
## Multiple R-squared:  0.5209, Adjusted R-squared:  0.5141 
## F-statistic: 76.97 on 5 and 354 DF,  p-value: < 2.2e-16
#Plot of model
plot(model2)

Log transformation of Variables

refined_data$LogSalePrice <- log(refined_data$SalePrice)
refined_data$LogGrLivArea <- log(refined_data$GrLivArea)

Model with LogSalePrice only

logSP_model <-
  lm(LogSalePrice ~ GrLivArea * Neighborhood, data = refined_data)

# Summary of model
sumlogSP_model <- summary(logSP_model)
sumlogSP_model
## 
## Call:
## lm(formula = LogSalePrice ~ GrLivArea * Neighborhood, data = refined_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.68104 -0.09390  0.01498  0.11110  0.47484 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    1.080e+01  7.877e-02 137.165  < 2e-16 ***
## GrLivArea                      7.253e-04  6.446e-05  11.252  < 2e-16 ***
## NeighborhoodEdwards            3.914e-01  1.087e-01   3.600 0.000364 ***
## NeighborhoodNAmes              6.632e-01  8.958e-02   7.403 9.77e-13 ***
## GrLivArea:NeighborhoodEdwards -3.429e-04  8.802e-05  -3.895 0.000117 ***
## GrLivArea:NeighborhoodNAmes   -4.218e-04  7.211e-05  -5.849 1.13e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1744 on 354 degrees of freedom
## Multiple R-squared:  0.5082, Adjusted R-squared:  0.5012 
## F-statistic: 73.15 on 5 and 354 DF,  p-value: < 2.2e-16
# Plot of model
plot(logSP_model)

Model with LogLivArea only

logLA_model <-
  lm(SalePrice ~ LogGrLivArea * Neighborhood, data = refined_data)

#Summary of model
sumlogLA_model <- summary(logLA_model)
sumlogLA_model
## 
## Call:
## lm(formula = SalePrice ~ LogGrLivArea * Neighborhood, data = refined_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -63722 -12519    908  13151  65018 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       -472619      59503  -7.943 2.65e-14 ***
## LogGrLivArea                        84613       8486   9.971  < 2e-16 ***
## NeighborhoodEdwards                215608      86789   2.484   0.0134 *  
## NeighborhoodNAmes                  144273      71922   2.006   0.0456 *  
## LogGrLivArea:NeighborhoodEdwards   -31375      12320  -2.547   0.0113 *  
## LogGrLivArea:NeighborhoodNAmes     -18354      10211  -1.798   0.0731 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21580 on 354 degrees of freedom
## Multiple R-squared:  0.5114, Adjusted R-squared:  0.5045 
## F-statistic:  74.1 on 5 and 354 DF,  p-value: < 2.2e-16
# Plot of model
plot(logLA_model)

Model with both log

logboth_model <-
  lm(LogSalePrice ~ LogGrLivArea * Neighborhood, data = refined_data)

# Summary of model
sumlogboth_model <- summary(logboth_model)
sumlogboth_model
## 
## Call:
## lm(formula = LogSalePrice ~ LogGrLivArea * Neighborhood, data = refined_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.69620 -0.09036  0.02169  0.10294  0.45985 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       6.09842    0.47243  12.909  < 2e-16 ***
## LogGrLivArea                      0.79242    0.06737  11.762  < 2e-16 ***
## NeighborhoodEdwards               2.17461    0.68907   3.156  0.00174 ** 
## NeighborhoodNAmes                 2.65458    0.57104   4.649 4.72e-06 ***
## LogGrLivArea:NeighborhoodEdwards -0.31346    0.09781  -3.205  0.00148 ** 
## LogGrLivArea:NeighborhoodNAmes   -0.35654    0.08107  -4.398 1.45e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1713 on 354 degrees of freedom
## Multiple R-squared:  0.5252, Adjusted R-squared:  0.5185 
## F-statistic: 78.33 on 5 and 354 DF,  p-value: < 2.2e-16
# Plot of model
plot(logboth_model)

Internal CV Press

# Function to calculate PRESS
calculate_press <- function(model, data) {
  n <- nrow(data)
  press <- 0
  
  for (i in 1:n) {
    # Fit model without the ith observation
    model_loo <- update(model, subset = -i)
    
    # Predict the ith observation
    pred <- predict(model_loo, data[i, , drop = FALSE])
    
    # Calculate squared prediction error and add to PRESS
    press <- press + (data$LogSalePrice[i] - pred)^2
  }
  
  return(press)
}

# Calculate PRESS statistics
press_original <- calculate_press(model, refined_data)
press_logSP <- calculate_press(logSP_model, refined_data)
press_logLA <- calculate_press(logLA_model, refined_data)
press_bothlog <- calculate_press(logboth_model, refined_data)

Compare Adjusted R- squared and Internal CV Press for models

cat("Adjusted R-squared for the original model:", modelsum$adj.r.squared, "\n")
## Adjusted R-squared for the original model: 0.4400466
cat("Adjusted R-squared for the LogSalePrice model:", sumlogSP_model$adj.r.squared, "\n")
## Adjusted R-squared for the LogSalePrice model: 0.5012172
cat("Adjusted R-squared for the LogLivingArea model:", sumlogLA_model$adj.r.squared, "\n")
## Adjusted R-squared for the LogLivingArea model: 0.5044703
cat("Adjusted R-squared for the both log model:", sumlogboth_model$adj.r.squared, "\n")
## Adjusted R-squared for the both log model: 0.5185443
cat("PRESS for original model:", press_original, "\n")
## PRESS for original model: 6.782684e+12
cat("PRESS for LogSalePrice model:", press_logSP, "\n")
## PRESS for LogSalePrice model: 11.17583
cat("PRESS for LogLivingArea model:", press_logLA, "\n")
## PRESS for LogLivingArea model: 6.573203e+12
cat("PRESS for log both model:", press_bothlog, "\n")
## PRESS for log both model: 10.76139

Parameters

sumlogboth_model
## 
## Call:
## lm(formula = LogSalePrice ~ LogGrLivArea * Neighborhood, data = refined_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.69620 -0.09036  0.02169  0.10294  0.45985 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       6.09842    0.47243  12.909  < 2e-16 ***
## LogGrLivArea                      0.79242    0.06737  11.762  < 2e-16 ***
## NeighborhoodEdwards               2.17461    0.68907   3.156  0.00174 ** 
## NeighborhoodNAmes                 2.65458    0.57104   4.649 4.72e-06 ***
## LogGrLivArea:NeighborhoodEdwards -0.31346    0.09781  -3.205  0.00148 ** 
## LogGrLivArea:NeighborhoodNAmes   -0.35654    0.08107  -4.398 1.45e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1713 on 354 degrees of freedom
## Multiple R-squared:  0.5252, Adjusted R-squared:  0.5185 
## F-statistic: 78.33 on 5 and 354 DF,  p-value: < 2.2e-16
model_CI <- confint(logboth_model, level = 0.95)
model_CI
##                                       2.5 %     97.5 %
## (Intercept)                       5.1692864  7.0275447
## LogGrLivArea                      0.6599205  0.9249283
## NeighborhoodEdwards               0.8194168  3.5298045
## NeighborhoodNAmes                 1.5315274  3.7776375
## LogGrLivArea:NeighborhoodEdwards -0.5058356 -0.1210931
## LogGrLivArea:NeighborhoodNAmes   -0.5159846 -0.1971023